QC the data appropriately and then find genes differentially expressed between cells with different genotypes.
scaterload("cs_scater_workshop.RData")
pd <- new("AnnotatedDataFrame", scater_wt_cell)
sce <- newSCESet(countData = scater_wt_counts, phenoData = pd)
sce## SCESet (storageMode: lockedEnvironment)
## assayData: 57865 features, 384 samples
## element names: counts, exprs
## protocolData: none
## phenoData
## sampleNames: S35P1P160125UOXZP21C05 S78P3P160125UOXCP21F10 ...
## S67P1P160125UOXZP21C09 (384 total)
## varLabels: Cell Genotype ... cDNA_ng_per_ul (7 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
Expression values are log2(cpm + 1).
sce <- getBMFeatureAnnos(
sce, filters = "ensembl_gene_id",
attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name",
"start_position", "end_position", "strand", "gene_biotype"),
feature_symbol = "hgnc_symbol",
feature_id = "ensembl_gene_id", biomart = "ENSEMBL_MART_ENSEMBL",
dataset = "hsapiens_gene_ensembl", host = "www.ensembl.org")
head(fData(sce), n = 1)## ensembl_gene_id hgnc_symbol chromosome_name start_position
## ENSG00000223972 ENSG00000223972 DDX11L1 1 11869
## end_position strand gene_biotype
## ENSG00000223972 14409 1 transcribed_unprocessed_pseudogene
## feature_symbol feature_id
## ENSG00000223972 DDX11L1 ENSG00000223972
head(pData(sce))## Cell Genotype Lab
## S35P1P160125UOXZP21C05 S35P1P160125UOXZP21C05 PSEN1 Zam
## S78P3P160125UOXCP21F10 S78P3P160125UOXCP21F10 PSEN1 Colin_Ackerman
## S44P1P160125UOXZP21D06 S44P1P160125UOXZP21D06 PSEN1 Zam
## S48P2P160125UOXCC21H06 S48P2P160125UOXCC21H06 control Colin_Ackerman
## S71P4P160125UOXZC21G09 S71P4P160125UOXZC21G09 control Zam
## S24P3P160125UOXCP21H03 S24P3P160125UOXCP21H03 PSEN1 Colin_Ackerman
## cell_bulk Induction FACS_Replicate cDNA_ng_per_ul
## S35P1P160125UOXZP21C05 cell 2 1 0.24
## S78P3P160125UOXCP21F10 cell 2 1 0.29
## S44P1P160125UOXZP21D06 cell 2 1 0.36
## S48P2P160125UOXCC21H06 cell 2 1 0.36
## S71P4P160125UOXZC21G09 cell 2 1 0.34
## S24P3P160125UOXCP21H03 cell 2 1 0.58
plot(sce, block1 = "Lab", block2 = "Genotype", colour_by = "cell_bulk",
exprs_values = "counts")#Two sets of feature controls: ERCC spike ins & mitochondrial genes
fctrls <- featureNames(sce)[grepl("ERCC-", featureNames(sce))]
fctrls2 <- featureNames(sce)[grepl("MT", fData(sce)$chromosome_name)]
#Cell controls are bulks and empty wells
cctrls <- cellNames(sce)[pData(sce)$cell_bulk == "bulk"]
#Set threshold of 80% of reads from feature controls to flag cell
sce <- calculateQCMetrics(
sce,
feature_controls = list(ERCC_ctrl = fctrls, mt_ctrl = fctrls2),
cell_controls = list(bulks = cctrls),
nmads = 8, pct_feature_controls_threshold = 80)plotPhenoData(sce, aes(x = pct_counts_top_200_features, y = total_features,
colour = cell_bulk, size = log10_total_counts)) +
geom_hline(yintercept = 2500, linetype = 2) +
geom_vline(xintercept = 50, linetype = 2)plotPhenoData(sce, aes(x = pct_counts_top_200_features, y = cDNA_ng_per_ul,
colour = cell_bulk, size = log10_total_counts)) +
geom_vline(xintercept = 50, linetype = 2) + coord_cartesian(ylim = c(0, 2.2))plotPhenoData(sce, aes(x = pct_counts_top_200_features, y = pct_counts_feature_controls,
colour = cell_bulk, size = log10_total_counts)) +
stat_smooth(colour = "firebrick2", linetype = 2) + geom_hline(yintercept = 13, linetype = 2)sce <- plotPCA(sce, size_by = "pct_counts_top_200_features", shape_by = "cell_bulk",
pca_data_input = "pdata", detect_outliers = TRUE,
return_SCESet = TRUE, selected_variables = c("cDNA_ng_per_ul", "pct_counts_top_200_features", "total_features", "pct_counts_feature_controls", "n_detected_feature_controls", "log10_counts_endogenous_features", "log10_counts_feature_controls"))plotPCA(sce, size_by = "pct_counts_top_200_features", shape_by = "cell_bulk",
colour_by = "outlier") + ggtitle("PCA using expression values")plotPCA(sce, size_by = "pct_counts_top_200_features", shape_by = "Lab",
colour_by = "Genotype") + ggtitle("PCA using expression values")plotPCA(sce, size_by = "pct_counts_top_200_features", shape_by = "Genotype",
colour_by = "Lab") + ggtitle("PCA using expression values")sce$use <- (sce$total_features > 2000 & # sufficient genes detected
sce$pct_counts_top_200_features < 50 & # sufficient library complexity
sce$pct_counts_feature_controls < 14 & # sufficient endogenous RNA
sce$total_counts > 1e5 & # sufficient reads mapped to features
!sce$filter_on_total_features & # remove cells with unusual numbers of genes
!sce$is_cell_control # controls shouldn't be used in downstream analysis
)
knitr::kable(as.data.frame(table(sce$use)))| Var1 | Freq |
|---|---|
| FALSE | 39 |
| TRUE | 345 |
plotPCA(sce[, sce$use], size_by = "pct_counts_top_200_features", shape_by = "Genotype",
colour_by = "Lab") + ggtitle("PCA after filtering cells")plotQC(sce[, sce$use], type = "find-pcs", variable = "Lab" )plotQC(sce[, sce$use], type = "find-pcs", variable = "Genotype" )plotQC(sce, type = "highest-expression", exprs_values = "counts", n = 20) +
theme(axis.text.y = element_text(size = 10),
axis.title = element_text(size = 11))plotQC(sce, type = "exprs-freq-vs-mean", feature_controls = fctrls)keep_gene <- rowMeans(counts(sce)) >= 1
fData(sce)$use <- keep_gene
knitr::kable(as.data.frame(table(keep_gene)))| keep_gene | Freq |
|---|---|
| FALSE | 45058 |
| TRUE | 12807 |
scran package to apply scaling normalisation to the data.isSpike, computeSumFactors, computeSpikeFactors and normaliselibrary(scran)
sce <- sce[fData(sce)$use, sce$use]
isSpike(sce) <- "ERCC_ctrl"
sce <- computeSumFactors(sce, sizes = c(20, 40, 60, 80))
summary(sizeFactors(sce))## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3077 0.6856 0.9835 1.0000 1.2540 2.8190
sce <- computeSpikeFactors(sce, type = "ERCC_ctrl", general.use = FALSE)
sce <- normalise(sce, return_norm_as_exprs = FALSE)p1 <- plotPCA(sce, size_by = "total_features", shape_by = "Genotype",
colour_by = "Lab") + ggtitle("PCA before normalisation")
p2 <- plotPCA(sce, size_by = "total_features", shape_by = "Genotype",
colour_by = "Lab", exprs_values = "norm_exprs") + ggtitle("PCA after normalisation")
multiplot(p1, p2, cols = 2)plotQC(sce, type = "expl",
variables = c("pct_counts_top_100_features", "total_features",
"pct_counts_feature_controls", "Lab",
"n_detected_feature_controls", "Genotype",
"log10_counts_endogenous_features",
"log10_counts_feature_controls"))The PCA plots above show that scaling normalisation does not remove batch effects (this is beyond its scope).
Options for dealing with batch effects (in order of preference):
Options:
RUVs method from the RUVSeq package.scater’s normaliseExprs() function to regress out the Lab effect.design <- model.matrix(~sce$Lab)
set_exprs(sce, "norm_exprs_resid") <- norm_exprs(
normaliseExprs(sce, design = design,
method = "none", exprs_values = "exprs",
return_norm_as_exprs = FALSE))p3 <- plotPCA(sce, exprs_values = "norm_exprs_resid", shape_by = "Genotype",
colour_by = "Lab", size_by = "total_features") +
ggtitle("PCA - size-factor normalisation residuals")
multiplot(p2, p3, cols = 2)plotTSNE(sce, exprs_values = "norm_exprs_resid", shape_by = "Lab",
colour_by = "Genotype", size_by = "total_features", rand_seed = 5) +
ggtitle("t-SNE - size-factor normalisation residuals")edgeR [Bioc]: use trend.method="none" argument in estimateDisp callscde [Bioc]M3Drop [Bioc]MAST [devtools::install_github("RGLab/MAST@summarizedExpt"")]BPSC [devtools::install_github(“nghiavtr/BPSC”)]scDD [devtools::install_github(“kdkorthauer/scDD”)]It depends…
We To find genes DE between PSEN1 and control cells accounting for Lab effects (at least). So we want the following in a DE method/tool:
(a good tool should also be reliable, easily available, and well supported)
Choose two.
edgeR: general designs, fast, reliable | negative binomial distribution (unimodal)MAST: general designs, single-cell model | fast, reliable(?)BPSC: general designs, single-cell model | fast(?), reliable(?),scde: single-cell model | fast, reliable(?), general designsscDD: single-cell model (non-parametric) | fast(?), reliable(?), general designs(?)Tips:
convertTo function in scran to convert an SCESet to a DGEList for edgeR analysis. (This retains necessary metadata, including normalisation factors previously computed.)library(edgeR)
dge <- convertTo(sce, "edgeR")
design <- model.matrix(~Lab + Genotype, data = pData(sce))
head(design)## (Intercept) LabZam GenotypePSEN1
## S35P1P160125UOXZP21C05 1 1 1
## S78P3P160125UOXCP21F10 1 0 1
## S44P1P160125UOXZP21D06 1 1 1
## S48P2P160125UOXCC21H06 1 0 0
## S71P4P160125UOXZC21G09 1 1 0
## S24P3P160125UOXCP21H03 1 0 1
dge <- estimateDisp(dge, design, trend.method = "none")plotBCV(dge)fit <- glmFit(dge, design, prior.count = 0.5)
results <- glmLRT(fit)
summary(decideTestsDGE(results))## [,1]
## -1 123
## 0 12439
## 1 202
topTags(results)## Coefficient: GenotypePSEN1
## logFC logCPM LR PValue FDR
## ENSG00000187653 2.0466186 5.485889 198.45372 4.542261e-45 5.797742e-41
## ENSG00000215030 2.2978459 5.617554 195.14951 2.389905e-44 1.525238e-40
## ENSG00000213058 2.3825384 3.833270 180.92401 3.045605e-41 1.295804e-37
## ENSG00000224114 2.0730867 3.652758 156.94785 5.254660e-36 1.676762e-32
## ENSG00000215559 -3.5982078 4.478981 127.41208 1.509414e-29 3.853232e-26
## ENSG00000236876 1.7600141 3.548399 120.69503 4.456324e-28 9.480087e-25
## ENSG00000205542 1.2916156 11.639844 117.30226 2.464891e-27 4.494553e-24
## ENSG00000227097 2.0212006 3.977080 112.21216 3.210497e-26 5.122349e-23
## ENSG00000075624 0.9057414 13.272414 61.73445 3.930429e-15 5.574221e-12
## ENSG00000184009 0.8840725 12.939755 59.31414 1.344073e-14 1.715575e-11
n_de <- sum(abs(decideTestsDGE(results, p.value = 0.001)))
de_tags <- rownames(topTags(results, n = n_de)$table)
plotSmear(results, de.tags = de_tags, smooth.scatter = TRUE,
lowess = TRUE, cex = 0.4)plotExpression(sce, de_tags[1:6], x = "Genotype", colour_by = "Lab", ncol = 3)library("M3Drop")
norm_counts <- t(t(counts(sce)) / sizeFactors(sce))
m3_fit <- M3DropDropoutModels(norm_counts)m3_de <- M3DropDifferentialExpression(norm_counts,
mt_method = "fdr", mt_threshold = 0.01)heat_out <- M3DropExpressionHeatmap(m3_de$Gene, norm_counts,
cell_labels = sce$Genotype)heat_out <- M3DropExpressionHeatmap(m3_de$Gene, norm_counts,
cell_labels = sce$Lab)Beware batch effects!
In principle, the below should work…but it did not for me.
As it is not on CRAN or Bioconductor, there is no established forum for asking questions and obtaining advice, and (it would appear here) that it is not as reliable as it should be.
eset <- ExpressionSet(assayData(sce))
exprs(eset) <- get_exprs(sce, "exprs")
eset$condition <- as.numeric(as.factor(sce$Genotype))
sdd_res <- scDD(eset, testZeroes = FALSE)I’ll leave it with you to explore further options for DE testing. Many good tools are available, but it’s still the wild west out there.